The purpose of this workflow is to explore the effect of two human Klotho variants on gene expression in mice. It follows directly from 1.Klotho_Initial_Data_Visualization.Rmd and depends on output from that workflow.
rm(list = ls())
library(here)
fdr.thresh = 0.1 #fdr for differentially expressed genes
args <- commandArgs(trailingOnly=T)
subgroup <- args[2]
if(is.na(subgroup)){
#subgroup <- "all ages"
subgroup <- list("age_batch" = 12)
#subgroup <- list("age_batch" = 4)
#subgroup <- list("age_batch" = 4, "sex_climb" = "Female") #example with more than one filter
}
if(subgroup == "all ages"){
results.dir <- here("Results", "all")
}else{
subgroup.results <- paste(sapply(1:length(subgroup),
function(x) paste(names(subgroup)[x], subgroup[[x]], sep = "_")),
collapse = "_")
results.dir <- here("Results", subgroup.results)
}
general.data.dir <- here("Data", "general")
The subgroup analyzed in this workflow is: age_batch = 12
all.fun <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.fun)){source(all.fun[i])}
processed.data.dir <- file.path(results.dir, "processed_data")
general.processed.data.dir <- here("Results", "Processed_Data")
mouse.data.dir <- here("Data", "Mouse")
geno.cols <- get_geno_cols(here("Data", "general", "geno_cols.csv"))
needed.libraries <- c("pheatmap", "vioplot", "RColorBrewer",
"gprofiler2", "cluster", "pathview", "clusterProfiler", "stringr", "igraph", "wordcloud",
"wordcloud2")
load_libraries(needed.libraries)
mouse.info <- read.csv(file.path(mouse.data.dir, "metadata_validated.csv"))
orthos <- read.delim(file.path(general.data.dir, "human.mouse.orthologs.txt"))
mouse.genes <- read.delim(here("Data", "general", "mouse_gene_info.txt"))
covar.table <- read.table(file.path(processed.data.dir, "mouse_info.csv"), sep = ",", header = TRUE, row.names = 1)
raw.expr <- read.table(here("Data", "Mouse", "rsem.merged.gene_counts.tsv"), header = TRUE)
norm.expr <- readRDS(file.path(processed.data.dir, "Normalized_Expression.RDS"))
scaled.expr <- readRDS(file.path(processed.data.dir, "Scaled_Expression.RDS"))
#gene lists for KEGG, GO, Biodomains, and the intersections
bd.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_for_GSEA.RDS"))
sub.bd.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Subdomains_for_GSEA.RDS"))
bd.go.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_sub_GO_for_GSEA.RDS"))
kegg.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_for_GSEA.RDS"))
kegg.bd.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_Intersections_for_GSEA.RDS"))
We adjusted the expression by non-age covariates.
dummy.covar <- dummy_covar(covar.table[,c("sequencingBatch", "sex_ge")])
adj_expr <- adjust(t(scaled.expr), dummy.covar)
We made a set of matrices for which rows were KEGG-Biodomain intersections and columns were individual mice. The entry in each cell is the mean expression value of those genes for that mouse. (Initially, we made one matrix for each KEGG pathway. In each, the rows specify the biodomains.)
#This function returns mean gene expression values for a vector of genes IDs
#given the expression mat. The expression matrix should have genes in columns
#and individuals in rows
gene_mean_mat <- function(geneID, expr.mat){
common.genes <- intersect(geneID, colnames(expr.mat))
if(length(common.genes) > 0){
gene.means <- rowMeans(expr.mat[,common.genes,drop=FALSE])
}else{
gene.means <- NULL
}
return(gene.means)
}
#This function returns individual means for a non-nested
#list of genes, like in bd.list
non_nested_mean_vals <- function(non_nested_gene_list, expr.mat){
mean.mat <- matrix(NA, nrow = length(non_nested_gene_list), ncol = nrow(expr.mat))
rownames(mean.mat) <- names(non_nested_gene_list)
colnames(mean.mat) <- rownames(expr.mat)
for(i in 1:length(non_nested_gene_list)){
gene.means <- gene_mean_mat(non_nested_gene_list[[i]], expr.mat)
if(length(gene.means) > 0){
mean.mat[i,] <- gene.means
}
}
return(mean.mat)
}
#This function creates matrices of mean gene values for individual
#mice for a nested gene list, like kegg.bd.list
#organization lets you toggle between having the rows of each
#matrix be the outer level or the inner level
nested_mean_vals <- function(nested_gene_list, expr.mat, organization = c("outer", "inner")){
organization <- organization[1]
#the outer level makes up the rows of each matrix (i.e. biodomains)
if(organization == "outer"){
u_inner <- unique(unlist(lapply(nested_gene_list, names)))
mean.gene.list <- vector(mode = "list", length = length(u_inner))
names(mean.gene.list) <- u_inner
#get the gene list for each inner
for(i in 1:length(u_inner)){
i.gene.idx <- lapply(nested_gene_list, function(x) which(names(x) == u_inner[i]))
i.gene.list <- lapply(1:length(i.gene.idx),
function(x) if(length(i.gene.idx[[x]]) > 0){nested_gene_list[[x]][[i.gene.idx[[x]]]]})
names(i.gene.list) <- names(i.gene.idx)
i.gene.vals <- non_nested_mean_vals(non_nested_gene_list = i.gene.list, expr.mat)
#pheatmap(i.gene.vals)
mean.gene.list[[i]] <- i.gene.vals
}
}
if(organization == "inner"){
u_outer <- names(nested_gene_list)
mean.gene.list <- vector(mode = "list", length = length(u_outer))
for(o in 1:length(u_outer)){
if(length(nested_gene_list[[o]]) > 0){
o.gene.vals <- non_nested_mean_vals(non_nested_gene_list = nested_gene_list[[o]], expr.mat)
mean.gene.list[[o]] <- o.gene.vals
}
}
}
return(mean.gene.list)
}
plot_mean_bd_k <- function(mean.bd.k.mat, covar.mat, plot.label = "",
row.names = rownames(mean.bd.k.mat), label.margin = 12,
order.by = c("PC", "top.rank"), plot.results = TRUE,
global.min = NULL, global.max = NULL){
order.by <- order.by[1]
sub.decomp <- NULL
pc.r2 <- NULL
min.val <- global.min
max.val <- global.max
if(is.null(min.val)){
min.val <- min(mean.bd.k.mat, na.rm = TRUE)
}
if(is.null(max.val)){
max.val <- max(mean.bd.k.mat, na.rm = TRUE)
}
neg.vals <- length(which(mean.bd.k.mat < 0))
pos.vals <- length(which(mean.bd.k.mat > 0))
if(neg.vals > 0 && pos.vals > 0){
col.scale = c("blue", "brown")
grad.dir = "ends"
split.at.vals = TRUE
}
if(neg.vals > 0 && pos.vals == 0){
col.scale = "blue"
grad.dir = "low"
split.at.vals = TRUE
}
if(neg.vals == 0 && pos.vals > 0){
col.scale = "brown"
grad.dir = "high"
split.at.vals = TRUE
}
has.vals <- which(!is.na(rowMeans(mean.bd.k.mat)))
if(length(has.vals) > 0){
sub.mat <- mean.bd.k.mat[has.vals,,drop=FALSE]
geno.names <- lapply(names(geno.cols), function(x) rownames(covar.mat)[which(covar.mat[,"climb_geno"] == x)])
cols <- lapply(1:length(geno.names), function(x) rep(geno.cols[x], length(geno.names[[x]])))
col.table <- cbind(unlist(geno.names), unlist(cols))
col.mat <- sapply(colnames(mean.bd.k.mat), function(x) col.table[match(x, col.table[,1]),2])
if(length(has.vals) > 1){
sub.decomp <- plot.decomp(t(sub.mat), cols = col.mat, main = "Decomposition",
cex = 2, plot.result = FALSE)
pc.model <- lm(sub.decomp$u[,1]~as.factor(col.table[,2]))
#pc.vals <- lapply(levels(as.factor(col.table[,2])), function(x) sub.decomp$u[which(col.table[,2] == x),1])
#val.order <- order(sapply(pc.vals, mean))
#boxplot(pc.vals[val.order]);abline(h = 0)
#plot.with.model(sub.decomp$u[,1], mean.bd.k.mat["Structural Stabilization",], col = col.table[,2])
pc.r2 <- summary(pc.model)$adj.r.squared
}else{
order.by = "top.rank"
}
test <- apply(mean.bd.k.mat, 1, function(x) if(!all(is.na(x))){lm(x~as.factor(covar.mat[,"climb_geno"]))}else{NA})
ind.r2 <- sapply(test, function(x) if(length(x) > 1){summary(x)$adj.r.squared}else{NA})
names(ind.r2) <- rownames(mean.bd.k.mat)
r2.order <- order(ind.r2)
if(order.by == "top.rank"){
col.order <- order(mean.bd.k.mat[which.max(ind.r2),])
}else{
col.order <- order(colMeans(mean.bd.k.mat, na.rm = TRUE))
}
if(plot.results){
#layout.mat <- matrix(c(1,1,1,1,2,1,1,1,1,2,5,5,5,5,0,0,0,0,4,4,3,3,3,4,4,3,3,3,4,4), ncol = 6, byrow = FALSE)
layout.mat <- matrix(c(1,1,1,1,2,1,1,1,1,2,5,5,5,5,0,6,6,6,7,7,6,6,6,4,4,3,3,3,4,4,3,3,3,4,4), ncol = 7, byrow = FALSE)
layout(layout.mat, widths = c(1, 1, 0.5, 0.5, 0.5, 0.5))
#pane 1 is the heat map for individual values
par(mar = c(0,label.margin,2,0))
imageWithText(mean.bd.k.mat[rev(r2.order),col.order,drop=FALSE], show.text = FALSE,
col.names = NULL, row.text.shift = 0.015, col.scale = col.scale, grad.dir = grad.dir,
split.at.vals = split.at.vals, color.fun = "linear",
light.dark = "f", global.color.scale = TRUE, global.min = min.val,
global.max = max.val, row.names = rev(row.names[r2.order]))
mtext(plot.label, side = 3, line = 0)
plot.dim <- par("usr")
color.coord <- segment_region(plot.dim[1], plot.dim[2], length(col.mat))
coord.dist <- apply(consec_pairs(color.coord), 1, function(x) x[2] - x[1])/2
#pane 2 is color bars for individual gentotypes
par(mar = c(2,label.margin,0,0))
plot.new()
plot.window(xlim = c(plot.dim[1], plot.dim[2]), ylim = c(0,1))
for(i in 1:length(col.mat)){
draw.rectangle(color.coord[i]-coord.dist[1], color.coord[i]+coord.dist[1],
0, 1, fill = col.mat[col.order[i]], border = NA)
}
#pane 3 is the decomposition
par(mar = c(4,4,4,2))
if(length(has.vals) > 1){
sub.decomp <- plot.decomp(t(sub.mat), cols = col.mat, main = "Decomposition", cex = 2)
pc.order <- order(sub.decomp$u[,1])
#pane 4 is the loadings on the matrix rows colored by genotype
par(mar = c(2,2,0,2))
barplot(sub.decomp$u[pc.order,1], col = col.mat[pc.order])
mtext("PC1 by genotype", side = 3, line = -1.5)
text(nrow(sub.decomp$u)*0.6, -0.1, paste("R^2 =", signif(pc.r2, 2)))
#plot.decomp(sub.mat, label.points = TRUE)
}else{
plot.text("Not enough rows to decompose")
plot.text("Not enough rows to decompose")
}
#pane 5 is the bar plot of r2 for each row
#add the adjusted R2 from the model
par(mar = c(0,0,2,2))
barplot(ind.r2[r2.order], horiz = TRUE, las = 2, xlim = c(0, 0.4), names = NA)
plot.dim <- par("usr")
segments(x0 = 0, y0 = plot.dim[3], y1 = plot.dim[4])
#add grid lines
segments(x0 = seq(0, 0.4, 0.1), y0 = plot.dim[3], y1 = plot.dim[4],
lty = 2, col = "darkgray")
#pane 6 is the violin plots
#add violin plot of PC vals for highest row of the matrix
#or the overall PC values depending on the column order
par(mar = c(4,0,4,2))
genos <- levels(as.factor(covar.table[,"climb_geno"]))
if(order.by == "top.rank"){
max.idx <- which.max(ind.r2)
row.vals <- lapply(genos,
function(x) mean.bd.k.mat[max.idx,which(covar.table[,"climb_geno"] == x)])
plot.label <- names(max.idx)
}else{
row.vals <- lapply(genos,
function(x) sub.decomp$u[which(covar.table[,"climb_geno"] == x),1])
plot.label = "PC1"
}
mean.order <- order(sapply(row.vals, mean))
vioplot(row.vals[mean.order], col = geno.cols[genos[mean.order]],
names = genos[mean.order], las = 2, main = plot.label)
stripchart(row.vals[mean.order], add = TRUE, col = "#dd1c77",
vertical = TRUE, pch = 16, method = "jitter")
plot.dim <- par("usr")
segments(x0 = plot.dim[1], x1 = plot.dim[2], y0 = 0)
#pane 7 is a legend for the genotype colors
plot.new()
plot.window(xlim = c(0,1), ylim = c(0,1))
legend(0, 0.9, fill = geno.cols, legend = names(geno.cols))
}
}#end case for no values
result <- list("decomp" = sub.decomp, "row.r2" = ind.r2, "overall.r2" = pc.r2)
invisible(result)
}
merge_mean_mats <- function(mat_list){
sub.mats <- Reduce("rbind", mat_list)
inner.names <- lapply(mat_list, rownames)
outer.names <- lapply(1:length(inner.names), function(x) rep(names(inner.names)[x], length(inner.names[[x]])))
name.mat <- cbind(unlist(outer.names), unlist(inner.names))
mat.labels <- apply(name.mat, 1, function(x) paste(x, collapse = " : "))
rownames(sub.mats) <- mat.labels
sub.vals <- sub.mats[which(!is.na(rowMeans(sub.mats))),]
return(sub.vals)
}
gene_vioplot <- function(gene.name){
gene.id <- mouse.genes[which(mouse.genes[,"external_gene_name"] == gene.name),"ensembl_gene_id"]
geno.expr <- lapply(geno.idx, function(x) adj_expr[x,gene.id])
model <- lm(adj_expr[,gene.id]~as.factor(covar.table[,"climb_geno"]))
f <- summary(model)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
geno.order <- order(sapply(geno.expr, mean))
vioplot(geno.expr[geno.order], col = geno.cols[geno.order],
main = paste(gene.name, "\np =", signif(p, 2)))
stripchart(geno.expr[geno.order], method = "jitter", vertical = TRUE,
pch = 16, col = "#dd1c77", add = TRUE)
abline(h = 0)
}
#if inner.term is NULL, a matrix is assumed.
#if it is specified, a list is assumed
#mean.term.vals is a matrix of mean values like mean.k.vals
#or a list, like mean.bd.k.vals, in which case, inner.term
#needs to be specified.
term_vioplot <- function(outer.term, inner.term = NULL, mean.term.vals){
if(is.null(inner.term)){
term.idx <- which(rownames(mean.term.vals) == outer.term)
if(length(term.idx) == 0){stop(paste("I can't find", outer.term))}
term.expr <- lapply(geno.idx, function(x) mean.term.vals[term.idx,x])
plot.label <- outer.term
}else{
#otherwise we have a list
outer.idx <- which(names(mean.term.vals) == outer.term)
if(length(outer.idx) == 0){stop(paste("I can't find", outer.term))}
inner.idx <- which(rownames(mean.term.vals[[outer.idx]]) == inner.term)
if(length(inner.idx) == 0){stop(paste("I can't find", inner.term))}
term.expr <- lapply(geno.idx, function(x) mean.term.vals[[outer.idx]][inner.idx,x])
plot.label <- paste(outer.term, inner.term, sep = " : ")
}
model <- lm(unlist(term.expr)~as.factor(covar.table[unlist(geno.idx),"climb_geno"]))
f <- summary(model)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
geno.order <- order(sapply(term.expr, mean))
vioplot(term.expr[geno.order], col = geno.cols[geno.order],
main = paste(plot.label, "\np =", signif(p, 2)))
stripchart(term.expr[geno.order], method = "jitter", vertical = TRUE,
pch = 16, col = "#dd1c77", add = TRUE)
abline(h = 0)
invisible(term.expr)
}
We first looked at biodomains and KEGG pathways separately The plots below show the results for Biodomains.
mean.bd.vals <- non_nested_mean_vals(bd.list, expr.mat = adj_expr)
bd.result <- plot_mean_bd_k(mean.bd.k.mat = mean.bd.vals,
covar.mat = covar.table, plot.results = FALSE)
par(mar = c(4,20,2,2))
barplot(sort(bd.result$row.r2), horiz = TRUE, las = 2, xlab = "Adjusted R2",
main = "Biodomains")
The plots below show how well each biodomain separates the genotypes
geno.idx <- lapply(names(geno.cols), function(x) which(covar.table[,"climb_geno"] == x))
names(geno.idx) <- names(geno.cols)
#pdf("~/Desktop/biodomains.pdf", width = 7, height = 5)
for(bd in 1:length(bd.list)){
cat("####", names(bd.list)[bd], "\n")
geno.vals <- lapply(geno.idx, function(x) mean.bd.vals[bd,x])
geno.order <- order(sapply(geno.vals, mean))
model <- lm(mean.bd.vals[bd,]~as.factor(covar.table[,"climb_geno"]))
f <- summary(model)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
vioplot(geno.vals[geno.order], col = geno.cols[geno.order],
main = paste(names(bd.list)[bd], "\np =", signif(p, 2)))
stripchart(geno.vals[geno.order], vertical = TRUE, pch = 16, method = "jitter",
add = TRUE, col = "#dd1c77")
abline(h = 0)
cat("\n\n")
}
#dev.off()
We also looked at the subdomains. The boxplot below shows the distribution of adjusted R2 for the subdomains within each domain.
mean.sbd.vals <- nested_mean_vals(nested_gene_list = sub.bd.list, expr.mat = adj_expr,
organization = "inner")
sbd.result <- lapply(mean.sbd.vals, function(x) if(length(x) > 0){plot_mean_bd_k(x,
covar.mat = covar.table, plot.results = FALSE)}else{NA})
sbd.r2 <- lapply(sbd.result, function(x) if(length(x) > 1){x$row.r2}else{NA})
names(sbd.r2) <- names(sub.bd.list)
#r2.order <- order(sapply(sbd.r2, function(x) if(length(x) > 0){mean(x, na.rm = TRUE)})) #by biodomain mean
r2.order <- order(bd.result$row.r2) #by overall biodomain r2
par(mar = c(4,16,2,2))
boxplot(sbd.r2[r2.order], las = 2, horizontal = TRUE)
abline(v = seq(0, 0.3, 0.1), lty = 2, col = "darkgray")
points(x = bd.result$row.r2[r2.order], y = 1:length(r2.order), col = "blue",
pch = 16)
The bar plots below shows the R^2 values within each biodomain.
#pdf("~/Desktop/subdomain_r2.pdf", width = 10, height = 7)
max.r2 <- max(unlist(sbd.r2), na.rm = TRUE)
for(bd in 1:length(sbd.r2)){
if(length(sbd.r2[[bd]]) > 1){
cat("####", names(sbd.r2)[bd], "\n")
par(mar = c(4,30,2,2))
r2.vals <- sbd.r2[[bd]]
r2.vals[which(r2.vals < 0)] <- 0
barplot(sort(r2.vals), horiz = TRUE, las = 2, xlab = "Adjusted R2",
main = names(sbd.r2)[bd], xlim = c(0, max.r2))
abline(v = seq(0, max.r2, 0.1), lty = 2, col = "darkgray")
cat("\n\n")
}
}
#dev.off()
The plot below shows the top subdomains overall
all.sub.r2 <- unlist(sbd.r2)
#hist(all.sub.r2, breaks = 25)
big.r2 <- which(all.sub.r2 >= 0.15)
#pdf("~/Desktop/top_r2.pdf", width = 10, height = 8)
par(mar = c(4,35,2,2))
barplot(sort(all.sub.r2[big.r2]), horiz = TRUE, las = 2)
abline(v = seq(0, max(all.sub.r2, na.rm = TRUE), 0.1), lty = 2)
#dev.off()
The plots below show how well each subdomain separates the genotypes
#pdf("~/Desktop/subdomains.pdf", width = 7, height = 5)
for(bd in 1:length(sub.bd.list)){
cat("####", names(sub.bd.list)[bd], "{.tabset .tabset-fade .tabset-pills}\n")
if(length(sub.bd.list[[bd]]) > 0){
for(s in 1:length(sub.bd.list[[bd]])){
cat("#####", names(sub.bd.list[[bd]])[s], "\n")
geno.vals <- lapply(geno.idx, function(x) mean.sbd.vals[[bd]][s,x])
geno.order <- order(sapply(geno.vals, mean))
model <- lm(unlist(geno.vals)~as.factor(covar.table[unlist(geno.idx),"climb_geno"]))
f <- summary(model)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
vioplot(geno.vals[geno.order], col = geno.cols[geno.order],
main = paste(names(sub.bd.list)[bd], "\n", names(sub.bd.list[[bd]])[s], "\np =", signif(p, 2)))
stripchart(geno.vals[geno.order], vertical = TRUE, pch = 16, method = "jitter",
add = TRUE, col = "#dd1c77")
abline(h = 0)
cat("\n\n")
}
}
cat("\n\n")
}
#dev.off()
For each subdomain, we calculated the mean expression across the genotypes. The plots below show these means grouped by biodomain.
#pdf("~/Desktop/sub_plots.pdf", width = 12, height = 5)
sd.global.min <- min(unlist(mean.sbd.vals), na.rm = TRUE)
sd.global.max <- max(unlist(mean.sbd.vals), na.rm = TRUE)
for(bd in 1:length(mean.sbd.vals)){
if(length(mean.sbd.vals[[bd]]) > 0){
cat("####", names(sub.bd.list)[bd], "\n")
plot_mean_bd_k(mean.sbd.vals[[bd]], covar.mat = covar.table,
label.margin = 24, global.min = sd.global.min,
global.max = sd.global.max)
cat("\n\n")
}
}
#dev.off()
The plots below show results for KEGG pathways. Because there are so many KEGG pathways, the bar plot only shows the top 20 results.
mean.k.vals <- non_nested_mean_vals(kegg.list, expr.mat = adj_expr)
k.result <- plot_mean_bd_k(mean.bd.k.mat = mean.k.vals,
covar.mat = covar.table, plot.results = FALSE)
par(mar = c(4,25,2,2))
barplot(tail(sort(k.result$row.r2), 25), horiz = TRUE, las = 2, xlab = "Adjusted R2",
main = "KEGG pathways")
The plots below show how well each biodomain separates the genotypes
k.order <- order(k.result$row.r2, decreasing = TRUE)
#pdf("~/Desktop/kegg.pdf", width = 7, height = 5)
kegg.mean.mat <- matrix(NA, nrow = 25, ncol = 5)
rownames(kegg.mean.mat) <- names(k.result$row.r2)[k.order[1:25]]
colnames(kegg.mean.mat) <- names(geno.cols)
for(k in 1:25){
cat("####", names(k.result$row.r2)[k.order[k]], "\n")
geno.vals <- lapply(geno.idx, function(x) mean.k.vals[k.order[k],x])
geno.order <- order(sapply(geno.vals, mean))
model <- lm(mean.k.vals[k.order[k],]~as.factor(covar.table[,"climb_geno"]))
f <- summary(model)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
vioplot(geno.vals[geno.order], col = geno.cols[geno.order],
main = paste(names(kegg.list)[k.order[k]], "\np =", signif(p, 2)))
stripchart(geno.vals[geno.order], vertical = TRUE, pch = 16, method = "jitter",
add = TRUE, col = "#dd1c77")
abline(h = 0)
kegg.mean.mat[k,] <- sapply(geno.vals, mean)
cat("\n\n")
}
#dev.off()
The plot below gives an overview of the expression of the top KEGG pathways.
mean.decomp <- plot.decomp(t(kegg.mean.mat), label.points = TRUE)
row.order <- order(mean.decomp$v[,1])
png("~/Desktop/kegg_mean.png", width = 11, height = 7, units = "in", res = 300)
layout.mat <- matrix(c(1,2), nrow = 1)
layout(layout.mat, widths = c(1, 0.2))
par(mar = c(4,25,2,2))
imageWithText(signif(kegg.mean.mat[row.order,], 2), split.at.vals = TRUE,
col.scale = c("blue", "brown"), grad.dir = "ends", col.text.shift = 0.03,
row.text.shift = 0.15)
par(mar = c(20,4,2,4))
imageWithTextColorbar(signif(kegg.mean.mat[row.order,], 2), split.at.vals = TRUE,
col.scale = c("blue", "brown"), grad.dir = "ends", cex = 1)
dev.off()
## quartz_off_screen
## 2
#calculate mean gene expression for a given gene list
mean.bd.k.file <- file.path(results.dir, "KEGG_BD_individual_expression.RDS")
if(!file.exists(mean.bd.k.file)){
mean.bd.k.vals <- nested_mean_vals(nested_gene_list = kegg.bd.list,
expr.mat = adj_expr, "outer")
saveRDS(mean.bd.k.vals, mean.bd.k.file)
}else{
mean.bd.k.vals <- readRDS(mean.bd.k.file)
}
min.val <- min(unlist(mean.bd.k.vals), na.rm = TRUE)
max.val <- max(unlist(mean.bd.k.vals), na.rm = TRUE)
kegg.plot.dir <- file.path(results.dir, "KEGG_plots")
all.bd.k.result <- vector(mode = "list", length = length(mean.bd.k.vals))
names(all.bd.k.result) <- names(mean.bd.k.vals)
pdf(file.path(kegg.plot.dir, "kegg_bd_intersections_individuals.pdf"), width = 15, height = 7)
hist(unlist(mean.bd.k.vals), xlab = "Mean Value", main = "Mean Biodomain-Kegg Intersection Expression")
for(i in 1:length(mean.bd.k.vals)){
if(length(mean.bd.k.vals[[i]]) > 0){
all.bd.k.result[[i]] <- plot_mean_bd_k(mean.bd.k.mat = mean.bd.k.vals[[i]],
covar.mat = covar.table, plot.label = names(mean.bd.k.vals)[i], order.by = "top.rank")
}
}
dev.off()
## quartz_off_screen
## 2
all.r2 <- Reduce("rbind", lapply(all.bd.k.result, function(x) x$row.r2))
rownames(all.r2) <- names(all.bd.k.result)
all.r2[which(is.na(all.r2))] <- 0
all.r2[which(all.r2 < 0)] <- 0
row.order <- order(apply(all.r2, 1, max))
col.order <- order(apply(all.r2, 2, max))
pdf(file.path(results.dir, "allr2.pdf"), width = 9, height = 40)
pheatmap(all.r2[row.order,col.order], cluster_rows = FALSE, cluster_cols = FALSE)
dev.off()
mean.order <- order(apply(all.r2, 2, max))
par(mar = c(4, 16, 4, 2))
boxplot(all.r2[,mean.order], las = 2, horizontal = TRUE)
top.n = 10
par(mar = c(4, 24, 4, 2))
barplot(tail(sort(all.r2[,"APP Metabolism"]), top.n), las = 2, horiz = TRUE)
bd.k.genes <- unlist(kegg.bd.list, recursive = FALSE)
#bd.k.jaccard <- jaccard.matrix(bd.k.genes)
#hist(bd.k.jaccard)
kegg.bd.mat <- matrix(0, nrow = length(bd.list), ncol = length(kegg.list))
rownames(kegg.bd.mat) <- names(bd.list)
colnames(kegg.bd.mat) <- names(kegg.list)
k.bd.test <- vector(mode = "list", length = length(kegg.list))
names(k.bd.test) <- names(kegg.list)
for(i in 1:length(names(kegg.list))){
k.idx <- grep(names(kegg.list)[i], names(bd.k.genes))
#names(bd.k.genes)[k.idx]
#print(length(k.idx))
if(length(k.idx) > 0){
sub.list <- bd.k.genes[k.idx]
k.bd.test[[i]] <- unlist(sub.list)
present.bd <- sapply(strsplit(names(bd.k.genes)[k.idx], ".", fixed = TRUE), function(x) x[1])
kegg.bd.mat[present.bd,i] <- sapply(sub.list, length)
}else{
k.bd.test[[i]] <- NA
}
}
pdf("~/Desktop/intersection_counts.pdf", width = 12, height = 45)
pheatmap(t(kegg.bd.mat), display_numbers = TRUE, number_format = "%.0f")
dev.off()
bip.mat <- graph_from_biadjacency_matrix(kegg.bd.mat)
no.con <- which(degree(bip.mat) == 0)
simp.net <- delete_vertices(bip.mat, no.con)
vert.col <- rep("#7fc97f", vcount(simp.net))
vert.col[which(V(simp.net)$name %in% names(bd.list))] <- "#beaed4"
pdf("~/Desktop/adj.pdf", width = 9, height = 40)
pheatmap(sqrt(t(kegg.bd.mat)))
dev.off()
test.mat <- jaccard.matrix(k.bd.test)
pdf("~/Desktop/kegg_jaccard.pdf", width = 40, height = 40)
pheatmap(test.mat)
dev.off()
adj.mat <- test.mat
diag(adj.mat) <- 0
k.net <- graph_from_adjacency_matrix(adj.mat, mode = "undirected", weighted = TRUE)
k.deg <- colSums(test.mat)
barplot(sort(k.deg), las = 2)
pdf("~/Desktop/kegg_neg.pdf", width = 40, height = 40)
plot(k.net, vertex.size = 1)
dev.off()
barplot(sort(bd.deg), las = 2, horiz = TRUE)
mean.k.bd.vals <- nested_mean_vals(nested_gene_list = kegg.bd.list,
expr.mat = adj_expr, "inner")
names(mean.k.bd.vals) <- names(bd.list)
all.k.bd.result <- vector(mode = "list", length = length(mean.k.bd.vals))
names(all.k.bd.result) <- names(mean.k.bd.vals)
pdf(file.path(kegg.plot.dir, "kegg_bd_intersections_by_BD.pdf"), width = 12, height = 22)
for(i in 1:length(bd.list)){
all.k.bd.result[[i]] <- plot_mean_bd_k(mean.bd.k.mat = mean.k.bd.vals[[i]],
covar.mat = covar.table, plot.label = names(mean.k.bd.vals)[i])
}
dev.off()
total.r2 <- sapply(all.k.bd.result, function(x) x[[3]])
max.r2 <- sapply(all.k.bd.result, function(x) if(length(x) > 0){max(x[[2]])}else{NA})
par(mar = c(4, 16, 4, 2))
barplot(sort(unlist(total.r2)), las = 2, horiz = TRUE, xlim = c(-0.05, 0.2))
barplot(sort(unlist(max.r2)), las = 2, horiz = TRUE, xlim = c(0, 0.3))
All intersections for all individuals are shown below. This matrix does tend to separate the FC and VS genotypes from each other.
sub.vals <- merge_mean_mats(mean.bd.k.vals)
#pdf("~/Desktop/all_intersections.pdf", width = 15, height = 7)
sub.val.result <- plot_mean_bd_k(mean.bd.k.mat = sub.vals, covar.table, row.names = NULL,
order.by = "top.rank")
#dev.off()
sub.val.decomp <- sub.val.result$decomp
#barplot(cl.width, col = geno.cols)
The plot below shows that this separation is statistically significant along the first principal component of the matrix.
sub.decomp <- plot.decomp(sub.vals, plot.results = FALSE)
ind.vals <- sub.decomp$v #use the loadings on the individuals
#split loadings by genotype
ind.v <- lapply(geno.idx, function(x) ind.vals[x,1])
aov.test <- aov.by.list(ind.v)
ind.p <- aov.test$"Pr(>F)"[1]
geno.order <- order(sapply(ind.v, mean))
vioplot(ind.v[geno.order], ylab = "PC1", col = geno.cols[geno.order],
main = paste("Genotypes are distinguishable\np =", signif(ind.p, 2)))
stripchart(ind.v[geno.order], add = TRUE, vertical = TRUE, method = "jitter",
col = "#dd1c77", pch = 16)
abline(h = 0)
We did permutations to verify that this p value is drawn from a uniform distribution. We shuffled the individual labels and recalculated the ANOVA. The plot below shows the qq plot of the p value distribution compared to a uniform distribution.
rnd.aov <- vector(mode = "list", length = 1000)
for(p in 1:1000){
rnd.sample <- sample(1:ncol(sub.vals))
rnd.geno.idx <- lapply(names(geno.cols), function(x) which(covar.table[rnd.sample,"climb_geno"] == x))
names(rnd.geno.idx) <- names(geno.cols)
rnd.ind.v <- lapply(rnd.geno.idx, function(x) ind.vals[x,1])
#boxplot(rnd.ind.v)
#aov.by.list(rnd.ind.v)
rnd.aov[[p]] <- aov.by.list(rnd.ind.v)
}
rnd.p <- sapply(rnd.aov, function(x) x[,"Pr(>F)"][1])
qqunif.plot(rnd.p)
The loadings on the pathway intersections should tell us which pathways most strongly drive the separation of genotypes. The pathway intersection first two PCs are shown below. We are most interested in the pathways that vary along PC1. There is a skew toward the negative side. These are pahways that are down in FC relative to VS.
high.thresh <- get.percentile(sub.decomp$u[,1], 99)
low.thresh <- get.percentile(sub.decomp$u[,1], 1)
hist(sub.decomp$u[,1], xlab = "PC1", main = "PC1", breaks = 100)
abline(v = c(high.thresh, low.thresh), col = "red")
#plot(sort(sub.decomp$u[,1]), ylab = "PC1")
#abline(h = c(high.thresh, low.thresh))
#plot(sort(sub.decomp$u[,1]), 1:nrow(sub.decomp$u), xlab = "PC1", ylab = "Index")
#abline(v = c(high.thresh, low.thresh))
low.idx <- which(sub.decomp$u[,1] < low.thresh)
high.idx <- which(sub.decomp$u[,1] > high.thresh)
#plot(sub.decomp$u[,1], sub.decomp$u[,2], pch = 16,
# xlab = paste0("PC1 (", round(sub.decomp$var.exp[1]*100), "%)"),
# ylab = paste0("PC2 (", round(sub.decomp$var.exp[2]*100), "%)"), xlim = c(-0.15, 0.1))
#text(sub.decomp$u[high.idx,1], sub.decomp$u[high.idx,2], labels = rownames(sub.vals)[high.idx], cex = 0.7)
#text(sub.decomp$u[low.idx,1], sub.decomp$u[low.idx,2], labels = rownames(sub.vals)[low.idx], cex = 0.7)
pt.col <- rep("black", nrow(sub.decomp$u))
pt.col[c(high.idx, low.idx)] <- "red"
plot(sub.decomp$u[,1], sub.decomp$u[,2], pch = 16, col = pt.col,
xlab = paste0("PC1 (", round(sub.decomp$var.exp[1]*100), "%)"),
ylab = paste0("PC2 (", round(sub.decomp$var.exp[2]*100), "%)"), xlim = c(-0.15, 0.1))
#int.name <- "Glutathione metabolism : Proteostasis"
#pt.col <- rep("black", nrow(sub.decomp$u))
#pt.col[which(rownames(sub.vals) == int.name)] <- "red"
#plot(sub.decomp$u[,1], sub.decomp$u[,2], pch = 16, col = pt.col,
# xlab = paste0("PC1 (", round(sub.decomp$var.exp[1]*100), "%)"),
# ylab = paste0("PC2 (", round(sub.decomp$var.exp[2]*100), "%)"), xlim = c(-0.15, 0.1))
#abline(v = c(high.thresh, low.thresh))
A clearer image of these pathways is shown below.
large.path <- sub.vals[c(high.idx,low.idx),]
plot_mean_bd_k(large.path, covar.table, label.margin = 20, order.by = "top.rank")
The figure below shows the expression means for each genotype for each of these pathway intersections.
geno.idx <- lapply(names(geno.cols), function(x) which(covar.table[,"climb_geno"] == x))
names(geno.idx) <- names(geno.cols)
geno.mats <- lapply(geno.idx, function(x) large.path[,x])
geno.means <- sapply(geno.mats, rowMeans)
row.order <- hclust(dist(geno.means))$order
#test.mat <- t(apply(geno.means, 1, scale))
par(mar = c(4,20,2,2))
imageWithText(geno.means[row.order,c(2,3,1,4,5)], show.text = FALSE,
split.at.vals = TRUE, col.scale = c("blue", "brown"),
grad.dir = "ends", col.text.shift = 0.02, row.text.cex = 0.7, row.text.shift = 0.15)
#imageWithTextColorbar(geno.means, split.at.vals = TRUE, col.scale = c("blue", "brown"),
# grad.dir = "ends", cex = 1)
The decomposition below shows very nice separation of the genotypes.
mean.decomp <- plot.decomp(t(geno.means), label.points = TRUE,
cols = geno.cols, cex = 1.5, xlim = c(-0.5, 0.7))
#plot(mean.decomp$v)
#test <- plot.decomp(geno.means)
We built networks between biodomains and kegg pathways based on the pathway intersections that passed the threshold. We made one for the positive pathways (up in FC) and one for the negative pathways (down in FC).
Node size is just the degree of the node. I realize that this isn’t all that meaninful since the interactions can be a bit redundant. I’m working on developing a better system.
The edge color and width indicates the percent variance explained by genotype for the genes in that interaction.
build_net <- function(edge.names){
split.names <- strsplit(edge.names, " : ")
net.kegg <- sapply(split.names, function(x) x[1])
net.bd <- sapply(split.names, function(x) x[2])
net <- graph_from_edgelist(cbind(net.bd, net.kegg), directed = FALSE)
v.col <- rep("#7fc97f", vcount(net))
v.col[which(V(net)$name %in% names(bd.list))] <- "#beaed4"
V(net)$vertex.color <- v.col
return(net)
}
#pdf("~/Desktop/net.pdf", width = 15, height = 15)
high.net <- build_net(edge.names = rownames(sub.vals)[high.idx])
high.deg <- degree(high.net)
high.r2 <- sub.val.result$row.r2[high.idx]
ecol <- colors.from.values(high.r2, col.scale = "blue", grad.dir = "high")
plot(high.net, vertex.color = V(high.net)$vertex.color,
vertex.size = scale.between.vals(high.deg*3, 5, 40),
layout = layout_nicely, main = "Positive Pathways", edge.width = high.r2*20,
edge.color = ecol, vertex.label.cex = scale.between.vals(high.deg/2, 1, 3))
low.net <- build_net(edge.names = rownames(sub.vals)[low.idx])
low.deg <- degree(low.net)
low.r2 <- sub.val.result$row.r2[low.idx]
ecol <- colors.from.values(low.r2, col.scale = "blue", grad.dir = "high")
plot(low.net, vertex.color = V(low.net)$vertex.color,
vertex.size = scale.between.vals(low.deg*3, 5, 40),
layout = layout_nicely, main = "Negative Pathways", edge.width = low.r2*20,
edge.color = ecol, vertex.label.cex = scale.between.vals(low.deg/2, 1, 3))
#dev.off()
We looked more closely at individual intersections to get a better look at how the genes in there are expressed across the genotypes.
plot_individual_genes <- function(intersection.name = NULL, gene.list = NULL,
plot.type = c("individual", "means", "boxes", "none"), plot.label = "",
label.margin = 4, order.by = c("PC", "top.rank"), show.text = TRUE,
legend.x = NULL, legend.y = NULL){
order.by = order.by[1]
plot.type = plot.type[1]
if(is.null(intersection.name) && is.null(gene.list)){stop("At least one of intersection.name or gene.list must be specified")}
if(is.null(gene.list)){
split.name <- strsplit(intersection.name, " : ")
k.name <- split.name[[1]][1]
bd.name <- split.name[[1]][2]
bd.idx <- which(names(kegg.bd.list) == bd.name)
k.idx <- which(names(kegg.bd.list[[bd.idx]]) == k.name)
term.genes <- kegg.bd.list[[bd.idx]][[k.idx]]
plot.label <- intersection.name
}else{
term.genes <- gene.list
}
common.genes <- intersect(term.genes, colnames(adj_expr))
if(length(common.genes) == 0){
plot.text(paste("Cannot find genes for", intersection.name))
}
term.expr <- adj_expr[,common.genes,drop=FALSE]
gene.names <- mouse.genes[match(common.genes, mouse.genes[,"ensembl_gene_id"]), "external_gene_name"]
colnames(term.expr) <- gene.names
gene.means <- sapply(geno.idx, function(x) term.expr[x,,drop=FALSE])
if(ncol(gene.means[[1]]) > 1){
mean.mat <- sapply(gene.means, colMeans)
#plot.decomp(mean.mat, label.points = TRUE)
gene.decomp <- plot.decomp(mean.mat, plot.results = FALSE)
gene.order <- order(gene.decomp$u[,1])
}else{
mean.mat <- sapply(gene.means, colMeans)
mean.mat <- matrix(mean.mat, nrow = 1)
gene.order = 1
}
if(plot.type == "individual"){
plot_mean_bd_k(mean.bd.k.mat = t(term.expr[,gene.order,drop=FALSE]),
covar.mat = covar.table, plot.label = plot.label,
label.margin = label.margin, order.by = order.by)
}
if(plot.type == "means"){
imageWithText(mean.mat[gene.order,,drop=FALSE], show.text = show.text, split.at.vals = TRUE, col.scale = c("blue", "brown"),
grad.dir = "ends", row.text.shift = 0.15, col.text.shift = 0.08, col.text.rotation = 0,
col.text.adj = 0.5, main = plot.label)
}
if(plot.type == "boxes"){
plot.grouped.boxes(lapply(gene.means, function(x) x[,gene.order,drop=FALSE]),
type = "matrix", legend.x = legend.x, legend.y = legend.y,
group.cols = geno.cols, print.vals = NA, main = plot.label)
plot.dim <- par("usr")
segments(x0 = 0, x1 = plot.dim[2], y0 = 0)
}
invisible(common.genes)
}
expr_loadings <- function(term.name, gene.list, expr.mat, group.list){
term.idx <- which(names(gene.list) == term.name)
if(length(term.idx) == 0){stop("Can't find ", term.name)}
term.genes <- gene.list[[term.idx]]
common.genes <- intersect(term.genes, rownames(expr.mat))
if(length(common.genes) == 0){
plot.text(paste("Cannot find genes for ", term.name))
}
term.expr <- expr.mat[common.genes,]
gene.names <- mouse.genes[match(common.genes, mouse.genes[,"ensembl_gene_id"]), "entrezgene_id"]
rownames(term.expr) <- gene.names
group.means <- lapply(group.list, function(x) rowMeans(term.expr[,x]))
mean.diff <- group.means[[2]] - group.means[[1]]
#hist(mean.diff)
#gene.decomp <- plot.decomp(t(term.expr), label.points = TRUE)
#gene.decomp <- plot.decomp(term.expr, plot.results = FALSE)
#gene.v <- gene.decomp$v[,1]
#names(gene.v) <- gene.names
#return(gene.v)
return(mean.diff)
}
#term.name <- rownames(sub.vals)[high.idx[1]]
#term.name <- "Longevity regulating pathway : Endolysosome"
#term.name <- "Alzheimer disease : Mitochondrial Metabolism"
term.name <- "ECM-receptor interaction : Synapse"
intersection.dir <- file.path(results.dir, "Intersection_Plots")
if(!file.exists(intersection.dir)){dir.create(intersection.dir)}
#Paths with high values on PC1
pdf(file.path(intersection.dir, "High_Paths_Individuals.pdf"), width = 10, height = 5)
for(i in 1:length(high.idx)){
plot_individual_genes(intersection.name = rownames(sub.vals)[high.idx[i]],
plot.type = "individual")
}
dev.off()
## quartz_off_screen
## 2
pdf(file.path(intersection.dir, "High_Paths_Means.pdf"), width = 5, height = 7)
for(i in 1:length(high.idx)){
plot_individual_genes(intersection.name = rownames(sub.vals)[high.idx[i]], plot.type = "means")
}
dev.off()
## quartz_off_screen
## 2
pdf(file.path(intersection.dir, "High_Paths_Boxes.pdf"), width = 20, height = 5)
for(i in 1:length(high.idx)){
plot_individual_genes(intersection.nam = rownames(sub.vals)[high.idx[i]], plot.type = "boxes")
}
dev.off()
## quartz_off_screen
## 2
#Paths with low values on PC1
pdf(file.path(intersection.dir, "Low_Paths_Individuals.pdf"), width = 10, height = 5)
for(i in 1:length(low.idx)){
plot_individual_genes(intersection.nam = rownames(sub.vals)[low.idx[i]], plot.type = "individual")
}
dev.off()
## quartz_off_screen
## 2
pdf(file.path(intersection.dir, "Low_Paths_Means.pdf"), width = 5, height = 9)
for(i in 1:length(low.idx)){
plot_individual_genes(intersection.nam = rownames(sub.vals)[low.idx[i]], plot.type = "means")
}
dev.off()
## quartz_off_screen
## 2
pdf(file.path(intersection.dir, "Low_Paths_Boxes.pdf"), width = 20, height = 5)
for(i in 1:length(low.idx)){
plot_individual_genes(intersection.nam = rownames(sub.vals)[low.idx[i]], plot.type = "boxes")
}
dev.off()
## quartz_off_screen
## 2
#get all top paths that have a given term, and look at
#expression of those genes
#this function looks at the top pathways (either positive or negative)
#It collect all the genes that fall under a single term across multiple
#interactions, for example, all KEGG-Biodomain interactions that involve
#Synapse. It plots the gene level statistics for all the genes, as well
#as the enrichment for those genes. This might help untangle synaptic
#genes that are upregulated in VS vs. those that are downregulated, for
#example.
#Positive and negative are sort of arbitraty because they depend
#on the sign of the PC, but you can tell from the plots whether
#VS is up or down
plot_merged_list <- function(search.term, path.type = c("Positive", "Negative")){
if(path.type == "Positive"){
app.idx <- grep(search.term, rownames(sub.vals)[high.idx]); name.list <- rownames(sub.vals)[high.idx]
}else{
app.idx <- grep(search.term, rownames(sub.vals)[low.idx]); name.list <- rownames(sub.vals)[low.idx]
}
if(length(app.idx) == 0){
stop(paste("I couldn't find", search.term))
}
app.int <- name.list[app.idx]
print(app.int)
app.list <- lapply(app.int, function(x) plot_individual_genes(x, plot.type = "none"))
app.genes <- Reduce("union", app.list)
app.result <- plot_individual_genes(gene.list = app.genes,
plot.label = paste(search.term, "genes in", path.type, "Pathways"))
app.enrich <- gost(app.genes, organism = "mmusculus",
sources = c("GO", "KEGG", "REACTOME", "CORUM", "HP"))
quartz()
plot.enrichment(app.enrich, num.terms = 30,
plot.label = paste("Enrichment of", search.term, "genes in", path.type, "Pathways"))
}
plot_merged_list("Synapse", "Positive"); #cell adhesion
plot_merged_list("Synapse", "Negative"); #ribosome, translation at synapse
test.genes <- plot_individual_genes(intersection.nam = "Ras signaling pathway : APP Metabolism")
enrich <- gost(test.genes, organism = "mmusculus", sources = c("GO", "KEGG", "REACTOME", "CORM", "HP"))
par(mar = c(4,20, 2,2))
plot.enrichment(enrich, num.terms = 30, max.char = 100)
#cat(test.genes, sep = "\n")
I think a lot of these terms have similar genes in them. The code blow generates Jaccard index matrices and plots them to the Intersection Plot folder. The names are too long to plot in the html.
Interestingly, “Ribosome : Proteostasis” and “Ribosome : Structural stabilization” are basically the same set of genes, but other than that, the lists are fairly independent. There is a lot of overlap among the mitochondrial gene pathway intersections and the ribosome pathway intersections, but not perfect overlap.
There is not a lot of overlap among the ox-phos pathway intersections. The “Retrograde endocannabanoid signaling : Mitochondrial metabolism” intersection is also fairly distinct from the other Mitochondrial metabolism pathway intersections.
high.genes <- vector(mode = "list", length = length(high.idx))
names(high.genes) <- rownames(sub.vals)[high.idx]
for(i in 1:length(high.idx)){
high.genes[[i]] <- plot_individual_genes(intersection.name = rownames(sub.vals)[high.idx[i]], plot.type = "none")
}
high.gene.jaccard <- jaccard.matrix(high.genes)
pdf(file.path(intersection.dir, "High_Paths_Jaccard.pdf"), width = 10, height = 10)
pheatmap(high.gene.jaccard)
dev.off()
## pdf
## 3
low.genes <- vector(mode = "list", length = length(low.idx))
names(low.genes) <- rownames(sub.vals)[low.idx]
for(i in 1:length(low.idx)){
low.genes[[i]] <- plot_individual_genes(intersection.nam = rownames(sub.vals)[low.idx[i]], plot.type = "none")
}
low.gene.jaccard <- jaccard.matrix(low.genes)
pdf(file.path(intersection.dir, "Low_Paths_Jaccard.pdf"), width = 12, height = 12)
pheatmap(low.gene.jaccard)
dev.off()
## pdf
## 3
In the networks, biodomains are linked through KEGG pathways. The biodomains were designed to be as non-overlapping as possible. However, we know that they are correlated both through gene expression correlations and through interacting biology. The connections to KEGG pathways might give us a sense about how different aspects of Alzheimer’s disease are related to each other. By identifying genes that are differentially expressed across the genotypes and link biodomains together, we might be able to identify shared biology across the domains and identify causal factors that affect multiple pieces of the pie.
Below we investigate genes that link biodomains through KEGG pathways.
bd.names <- c("Synapse", "Apoptosis", "Vasculature"); kegg.names <- c("ECM-receptor interaction")
#bd.names <- c("Immune Response", "Synapse"); kegg.names <- c("Ribosome")
#bd.names <- c("Lipid Metabolism"); kegg.names <- c("Bacterial invasion of epithelial cells", "Adherens junction")
term.pairs <- bipartite_pairs(bd.names, kegg.names, "Biodomains", "KEGG")
pair.genes <- vector(mode = "list", length = nrow(term.pairs))
names(pair.genes) <- apply(term.pairs, 1, function(x) paste(x, collapse = " : "))
for(i in 1:nrow(term.pairs)){
bd.idx <- which(names(kegg.bd.list) == term.pairs[i,"Biodomains"])
k.idx <- which(names(kegg.bd.list[[bd.idx]]) == term.pairs[i,"KEGG"])
pair.genes[[i]] <- kegg.bd.list[[bd.idx]][[k.idx]]
}
plotVenn(pair.genes)
## Loading required package: VennDiagram
## Loading required package: futile.logger
The plot below shows the stats for the genes in the three-way intersection.
connector.genes <- Reduce("intersect", pair.genes)
#cat(connector.genes, sep = "\n")
#mousemine GO network
#put the connector genes into mouse mine, filter the GO term table
#to relevant GO terms, and download
#plot the genes and GO terms as a network
#It will probably be messy, but it is a way to see
#which genes in a list are involved in relevant functions
#for example ribosomal genes that are at the synapse and involved
#in immune function.
#mouse.table <- read.delim("~/Downloads/MFeature_GO_test.tsv", header = FALSE)
#pdf("~/Desktop/go_net.pdf", width = 20, height = 20)
#mousemine_network(mouse.table, gene.col = 2, min.cex = 2, min.v.size = 3, max.v.size = 10)
#dev.off()
#png(file.path(results.dir, "connector.png"), width = 10, height = 5, units = "in", res = 300)
plot_individual_genes(gene.list = connector.genes,
plot.type = "individual", plot.label = "", label.margin = 4, order.by = "PC")
#dev.off()
#connector.enrich <- gost(connector.genes, organism = "mmusculus", sources = c("GO", "KEGG", "REACTOME", "HP", "CORUM"))
#plot.enrichment(connector.enrich, num.terms = 30, max.term.size = 3000)
#cat(connector.genes, sep = "\n")
#plot_individual_genes(gene.list = connector.genes, plot.type = "boxes", plot.label = "", legend.x = 36, legend.y = 4)
#plot_individual_genes(gene.list = connector.genes, plot.type = "means", plot.label = "")
#gene_vioplot("Itga4")
#gene_vioplot("Itgav")
#gene_vioplot("Itgb1")
#gene_vioplot("Itgb3")
#gene_vioplot("Fn1")
#gene_vioplot("Rpl13a")
#kl.targets <- c("Fgf23", "Fgfr1", "Fgfr2", "Fgfr3", "Fgfr4", "Klb", "Lctl", "Trpv5")
#par(mfrow = c(2,4))
#for(i in 1:length(kl.targets)){
# gene_vioplot(kl.targets[i])
#}
library(pathview)
kegg.file <- here("Data", "general", "KEGG.Mouse.RDS")
if(!file.exists(kegg.file)){
all.kegg <- download_KEGG("mmu", "KEGG", "kegg")
saveRDS(all.kegg, kegg.file)
}else{
all.kegg <- readRDS(kegg.file)
}
u_path <- gsub(" - Mus musculus (house mouse)", "", all.kegg[[2]][,"to"], fixed = TRUE)
# build GSEA list
path.id <- all.kegg[[2]][,1]
path.idx <- lapply(path.id, function(x) which(all.kegg[[1]][,1] == unlist(x)[1]))
path.gene.id <- lapply(path.idx, function(x) all.kegg[[1]][x,2])
names(path.gene.id) <- u_path
path.gene.ensembl <- lapply(path.gene.id,
function(x) mouse.genes[match(x, mouse.genes[,"entrezgene_id"]), "ensembl_gene_id"])
comp_groups <- list("FC" = grep("FC", covar.table[,"climb_geno"]), "VS" = grep("VS", covar.table[,"climb_geno"]))
#int.name <- rownames(sub.vals)[high.idx[1]]
#int.name <- "Longevity regulating pathway : Endolysosome"
#int.name <- "Alzheimer disease : Mitochondrial Metabolism"
kegg.name <- "Phagosome"
kegg.name <- "Cell adhesion molecules"
kegg.name <- "Longevity regulating pathway"
kegg.name <- "Ribosome"
kegg.name <- "Oxidative phosphorylation"
kegg.name <- "Proteasome"
kegg.name <- "ECM-receptor interaction"
kegg.name <- "Glutamatergic synapse"
expr.vals <- expr_loadings(kegg.name, kegg.list, t(adj_expr), comp_groups)
#hist(expr.vals)
scaled.vals <- scale.between.vals(expr.vals, -1, 1)
#hist(scaled.vals)
#plot(expr.vals, scaled.vals);abline(h = 0, v = 0)
path.num <- path.id[which(u_path == kegg.name)]
pv.out <- pathview(gene.data = scaled.vals,
pathway.id = path.num, species = "mmu",
out.suffix = kegg.name,
kegg.dir = intersection.dir)
for(k in 1:length(kegg.list)){
kegg.vals <- expr_loadings(names(kegg.list)[i], kegg.list, t(adj_expr), comp_groups)
boxplot(kegg.vals)
}
all.kegg.diff <- lapply(names(kegg.list), function(x) expr_loadings(x, kegg.list, t(adj_expr), comp_groups))
diff.mean <- sapply(all.kegg.diff, mean)
boxplot(all.kegg.diff[order(diff.mean)])
abline(h = 0)
names(kegg.list)[head(order(diff.mean))]
names(kegg.list)[tail(order(diff.mean))]
common.kegg <- lapply(kegg.list, function(x) intersect(x, colnames(adj_expr)))
kegg.means <- sapply(common.kegg, function(x) rowMeans(adj_expr[,x]))
plot_mean_bd_k(t(kegg.means), covar.table)
kegg.decomp <- plot.decomp(t(kegg.means))
kegg.vals <- kegg.decomp$u[,1]
low.thresh <- get.percentile(kegg.vals, 1)
high.thresh <- get.percentile(kegg.vals, 99)
high.kegg <- which(kegg.vals >= high.thresh)
low.kegg <- which(kegg.vals <= low.thresh)
#names(kegg.list)[high.kegg]
#names(kegg.list)[low.kegg]
extreme.kegg <- names(kegg.list)[c(high.kegg, low.kegg)]
for(k in 1:length(extreme.kegg)){
expr.vals <- expr_loadings(extreme.kegg[k], kegg.list, t(adj_expr), comp_groups)
#print(length(expr.vals))
if(length(expr.vals) > 10){
#hist(expr.vals)
scaled.vals <- scale.between.vals(expr.vals, -1, 1)
#hist(scaled.vals)
#plot(expr.vals, scaled.vals);abline(h = 0, v = 0)
path.num <- path.id[which(u_path == extreme.kegg[k])]
pv.out <- try(pathview(gene.data = scaled.vals,
pathway.id = path.num, species = "mmu",
out.suffix = extreme.kegg[k],
kegg.dir = intersection.dir))
}
}
#here we plot violin plots of specific terms we are interested in.
pdf("~/Desktop/mapk.pdf", width = 7, height = 5)
term_vioplot(outer.term = "MAPK signaling pathway", inner.term = "Apoptosis",
mean.term.vals = mean.bd.k.vals)
term_vioplot(outer.term = "MAPK signaling pathway", inner.term = "DNA Repair",
mean.term.vals = mean.bd.k.vals)
dev.off()
term_vioplot(outer.term = "Type II diabetes mellitus", mean.term.vals = mean.k.vals)
term_vioplot(outer.term = "Proteasome", mean.term.vals = mean.k.vals)
#This block look for gene sets and plots their expression
#and decomposition by individual
#search.term <- c("Itga", "Itgb")
#search.term <- c("Fgf")
#search.term = "Grin"
search.term = "Mapk"
plot.genes <- unlist(lapply(search.term, function(x) mouse.genes[grep(x, mouse.genes[,"external_gene_name"]),"external_gene_name"]))
length(plot.genes)
gene.id <- mouse.genes[which(mouse.genes[,"external_gene_name"] %in% plot.genes),"ensembl_gene_id"]
common.genes <- intersect(colnames(adj_expr), gene.id)
names(common.genes) <- mouse.genes[match(common.genes, mouse.genes[,"ensembl_gene_id"]),"external_gene_name"]
gene.expr <- lapply(geno.idx, function(x) adj_expr[x,common.genes])
plot_individual_genes(gene.list = common.genes, order.by = "top.rank")
plot_individual_genes(gene.list = common.genes["Grin2b"], order.by = "top.rank")
for(i in 1:length(gene.expr)){
colnames(gene.expr[[i]]) <- names(common.genes)
}
plot.grouped.boxes(gene.expr, type = "matrix", group.cols = geno.cols, main = search.term,
print.vals = NA)
plot.dim <- par("usr")
segments(x0 = plot.dim[1], x1 = plot.dim[2], y0 = 0)
#This block looks at correlations between genes in
#a given intersection
int.genes <- plot_individual_genes(intersection.name = "ECM-receptor interaction : Synapse")
int.names <- mouse.genes[match(int.genes, mouse.genes[,"ensembl_gene_id"]),"external_gene_name"]
int.expr <- adj_expr[,int.genes]
colnames(int.expr) <- int.names
imageWithText(signif(cor(int.expr), 2))
term.name = "Ribosome"
term.idx <- which(names(kegg.list) == term.name)
plot_individual_genes(gene.list = kegg.list[[term.idx]],
plot.type = "individual", plot.label = "", label.margin = 4)
kegg.vals <- non_nested_mean_vals(kegg.list, adj_expr)
kegg.result <- plot_mean_bd_k(kegg.vals, covar.mat = covar.table, plot.label = "KEGG")
kegg.r2 <- kegg.result$row.r2
par(mar = c(4,22,2,2))
barplot(tail(sort(kegg.r2), 20), las = 2, horiz = TRUE)
#this block looks at enrichments of particular interactions
#in the KEGG-Biodomain network
term.name <- "ECM-receptor interaction : Synapse"
term.name <- "Cell adhesion molecules : Synapse"
term.name <- "Type II diabetes mellitus : Synapse"
term.name <- "Parkinson disease : DNA Repair"
term.name <- "Ras signaling pathway : APP Metabolism"
int.genes <- plot_individual_genes(intersection.name = term.name)
int.enrich <- gost(int.genes, organism = "mmusculus", sources = c("GO", "KEGG", "REACTOME", "HP", "CORUM"))
plot.enrichment(int.enrich, max.term.size = 3000, num.terms = 30, plot.label = term.name)
#checking results against
#Morphological and biochemical signs of age-related neurodegenerative changes in klotho mutant mice
test.genes <- c("Nefh", "Nefm", "Nefl", "Ina", "Prph", "Nes") #neurofilament genes
test.genes <- c("Map2") #increased in Klotho-deficient mice
test.genes <- "Ctsd" #lysosomal gene
test.genes <- c("Map1a", "Map1b")
test.genes <- "Bcl2l1" #antiapoptotic
test.genes <- "Bax" #proapoptotic
test.genes <- "Mapk1"
test.genes <- c("Gfap", "Vim", "Synm", "Nes") #astrocyte microfilament genes
test.genes <- c("Apoe", "Map3k12", "Mapk27", "Mapk1", "Fos", "App", "Ldlr")
test.genes <- c("Apoe", "App", "Kl")
test.id <- mouse.genes[match(test.genes, mouse.genes[,"external_gene_name"]), "ensembl_gene_id"]
plot_individual_genes(gene.list = test.id, order.by = "top.rank",
plot.type = "individual", plot.label = "", label.margin = 4)
plot_individual_genes(gene.list = test.id,
plot.type = "boxes", plot.label = "", label.margin = 4)
common.genes <- intersect(test.id, colnames(adj_expr))
common.names <- mouse.genes[match(common.genes, mouse.genes[,"ensembl_gene_id"]), "external_gene_name"]
expr.mat <- adj_expr[,common.genes]
colnames(expr.mat) <- common.names
pheatmap(cor(expr.mat), display_numbers = TRUE)
gene1 <- "Apoe"; gene2 <- "App"
pt.col <- rep(NA, nrow(covar.table))
for(i in 1:length(geno.cols)){
pt.col[which(covar.table[,"climb_geno"] == names(geno.cols)[i])] <- geno.cols[i]
}
plot.with.model(expr.mat[,gene1], expr.mat[,gene2], xlab = gene1, ylab = gene2, col = pt.col)
xlim = c(min(expr.mat[,gene1]), max(expr.mat[,gene1]))
ylim = c(min(expr.mat[,gene2]), max(expr.mat[,gene2]))
pdf("~/Desktop/gene_comparison.pdf", width = 5, height = 8)
layout.mat <- matrix(c(6,1,2,3,4,5), ncol = 2, byrow = TRUE)
layout(layout.mat)
for(i in 1:length(geno.idx)){
plot.with.model(expr.mat[geno.idx[[i]],gene1], expr.mat[geno.idx[[i]],gene2],
xlab = gene1, ylab = gene2, col = pt.col[geno.idx[[i]]], xlim = xlim, ylim = ylim,
main = names(geno.idx)[i], cex = 1.5, cex.lab = 1.5)
}
#mtext(paste(gene1, "and", gene2), side = 3, outer = TRUE, line = -1.5, font = 2)
plot.with.model(expr.mat[,gene1], expr.mat[,gene2], xlab = gene1, ylab = gene2,
col = pt.col, cex = 1.5, cex.lab = 1.5, main = "All")
dev.off()
#I checked WGCNA. Unfortunately, it didn't really yield much.
library(WGCNA)
enableWGCNAThreads()
powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(adj_expr, powerVector = powers, verbose = 5)
use.power <- sft$powerEstimate
if(is.na(use.power)){use.power = 6}
use.power
#signed if no negative values
#unsigned if negative values are included
#type = "unsigned"
type = "signed"
net = blockwiseModules(adj_expr, power = use.power,
TOMType = type, minModuleSize = 10,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs = FALSE,verbose = 3)
mergedColors = labels2colors(net$colors)
modules <- net$colors
num.wgcna.modules <- length(unique(modules))
u_mods <- sort(unique(modules))
mod.size <- sapply(u_mods, function(x) length(which(modules == x)))
barplot(mod.size)
enrich.file <- file.path(results.dir, paste0("WGCNA_Module_Enrichment_", type, ".RDS"))
if(!file.exists(enrich.file)){
mod.enrich <- lapply(u_mods, function(x) gost(names(which(modules == x)),
organism = "mmusculus", sources = c("GO", "KEGG", "REACTOME", "CORUM", "HP")))
saveRDS(mod.enrich, enrich.file)
}else{
mod.enrich <- readRDS(enrich.file)
}
pdf("~/Desktop/enrichment.pdf", width = 9, height = 40)
plot.enrichment.group(mod.enrich, max.term.size = 3000, transformation = "sqrt")
dev.off()
wgcna.list <- lapply(u_mods, function(x) names(which(modules == x)))
gene.r2 <- vector(mode = "list", length = length(wgcna.list))
mod.r2 <- rep(NA, length(wgcna.list))
for(i in 1:length(u_mods)){
print(i)
ind.genes <- lapply(wgcna.list[[i]], function(x) x)
wgcna.vals <- non_nested_mean_vals(ind.genes, adj_expr)
rownames(wgcna.vals) <- unlist(ind.genes)
wgcna.result <- plot_mean_bd_k(wgcna.vals, covar.mat = covar.table, plot.results = FALSE)
gene.r2[[i]] <- wgcna.result$row.r2
mod.r2[i] <- wgcna.result$overall.r2
}
barplot(sort(mod.r2))
mod.order <- order(mod.r2, decreasing = TRUE)
i = 1
ind.result <- plot_individual_genes(gene.list = names(which(modules == u_mods[mod.order[i]])))
quartz()
plot.enrichment(mod.enrich[[u_mods[high.pred[i]]]], max.term.size = 3000)
hist(unlist(gene.r2))
sorted.genes <- sort(unlist(gene.r2), decreasing = TRUE)
plot(sorted.genes)
#This gives basically the same results to other analyses.
library(fgsea)
#gsea.enrich <- fgsea::fgsea(kegg.list, sorted.genes, scoreType = "pos")
#gsea.enrich <- fgsea::fgsea(bd.list, sorted.genes, scoreType = "pos")
gsea.enrich <- fgsea::fgsea(unlist(kegg.bd.list, recursive = FALSE), sorted.genes, scoreType = "pos")
norm.es <- as.numeric(as.matrix(gsea.enrich[,"NES"]))
names(norm.es) <- gsea.enrich$pathway
par(mar = c(4,24, 2,2))
barplot(tail(sort(norm.es), 19), las = 2, horiz = TRUE)